---
title: "Global terrorism"
date: 2019-05-25T10-24-00
output:
  md_document: default
---
library(tidyverse)
library(skimr)
library(maps)
library(gganimate)
library(plotly)
library(streamgraph)
# Load the Global Terrorism Database export. Only the columns listed below get
# an explicit parser; every other column is left to readr's type guessing.
data <- read_csv(
  "data/globalterrorismdb_0718dist.csv",
  col_types = cols(
    # Columns parsed with col_number()
    ransomamtus = col_number(),
    ransompaidus = col_number(),
    # Columns parsed as integers
    attacktype3 = col_integer(),
    claimmode2 = col_integer(),
    claimmode3 = col_integer(),
    weaptype4 = col_integer(),
    weapsubtype4 = col_integer(),
    compclaim = col_integer(),
    # Columns kept as text
    ransomnote = col_character(),
    attacktype3_txt = col_character(),
    claimmode2_txt = col_character(),
    claimmode3_txt = col_character(),
    weaptype4_txt = col_character(),
    weapsubtype4_txt = col_character(),
    gsubname2 = col_character(),
    gname3 = col_character(),
    gsubname3 = col_character()
  )
)
# Base layer reused by the map plots: country outlines only, with the panel
# background, axis titles, axis text, ticks and x axis line removed.
world <- map_data("world")
worldmap <- ggplot(data = world, mapping = aes(x = long, y = lat)) +
  borders(database = "world") +
  theme(
    axis.title = element_blank(),
    axis.text = element_blank(),
    axis.ticks = element_blank(),
    axis.line.x = element_blank(),
    panel.background = element_blank()
  ) +
  coord_fixed(ratio = 1.2)
Data dimensions
# Number of rows and columns in the raw data.
data %>% dim()
## [1] 181691 135
First glance
# List every column name in the data set.
names(data)
## [1] "eventid" "iyear" "imonth"
## [4] "iday" "approxdate" "extended"
## [7] "resolution" "country" "country_txt"
## [10] "region" "region_txt" "provstate"
## [13] "city" "latitude" "longitude"
## [16] "specificity" "vicinity" "location"
## [19] "summary" "crit1" "crit2"
## [22] "crit3" "doubtterr" "alternative"
## [25] "alternative_txt" "multiple" "success"
## [28] "suicide" "attacktype1" "attacktype1_txt"
## [31] "attacktype2" "attacktype2_txt" "attacktype3"
## [34] "attacktype3_txt" "targtype1" "targtype1_txt"
## [37] "targsubtype1" "targsubtype1_txt" "corp1"
## [40] "target1" "natlty1" "natlty1_txt"
## [43] "targtype2" "targtype2_txt" "targsubtype2"
## [46] "targsubtype2_txt" "corp2" "target2"
## [49] "natlty2" "natlty2_txt" "targtype3"
## [52] "targtype3_txt" "targsubtype3" "targsubtype3_txt"
## [55] "corp3" "target3" "natlty3"
## [58] "natlty3_txt" "gname" "gsubname"
## [61] "gname2" "gsubname2" "gname3"
## [64] "gsubname3" "motive" "guncertain1"
## [67] "guncertain2" "guncertain3" "individual"
## [70] "nperps" "nperpcap" "claimed"
## [73] "claimmode" "claimmode_txt" "claim2"
## [76] "claimmode2" "claimmode2_txt" "claim3"
## [79] "claimmode3" "claimmode3_txt" "compclaim"
## [82] "weaptype1" "weaptype1_txt" "weapsubtype1"
## [85] "weapsubtype1_txt" "weaptype2" "weaptype2_txt"
## [88] "weapsubtype2" "weapsubtype2_txt" "weaptype3"
## [91] "weaptype3_txt" "weapsubtype3" "weapsubtype3_txt"
## [94] "weaptype4" "weaptype4_txt" "weapsubtype4"
## [97] "weapsubtype4_txt" "weapdetail" "nkill"
## [100] "nkillus" "nkillter" "nwound"
## [103] "nwoundus" "nwoundte" "property"
## [106] "propextent" "propextent_txt" "propvalue"
## [109] "propcomment" "ishostkid" "nhostkid"
## [112] "nhostkidus" "nhours" "ndays"
## [115] "divert" "kidhijcountry" "ransom"
## [118] "ransomamt" "ransomamtus" "ransompaid"
## [121] "ransompaidus" "ransomnote" "hostkidoutcome"
## [124] "hostkidoutcome_txt" "nreleased" "addnotes"
## [127] "scite1" "scite2" "scite3"
## [130] "dbsource" "INT_LOG" "INT_IDEO"
## [133] "INT_MISC" "INT_ANY" "related"
Deal with latitude and longitude
# Summary statistics for the two coordinate columns.
data %>%
  select(longitude, latitude) %>%
  skim()
## Skim summary statistics
## n obs: 181691
## n variables: 2
##
## -- Variable type:numeric --------------------------------------------------------
## variable missing complete n mean sd p0 p25 p50
## latitude 4556 177135 181691 23.5 18.57 -53.15 11.51 31.47
## longitude 4557 177134 181691 -458.7 2e+05 -8.6e+07 4.55 43.25
## p75 p100 hist
## 34.69 74.63 <U+2581><U+2581><U+2581><U+2583><U+2583><U+2587><U+2582><U+2581>
## 68.71 179.37 <U+2581><U+2581><U+2581><U+2581><U+2581><U+2581><U+2581><U+2587>
# Incidents that lack both a latitude and a specificity code, shown with the
# textual location fields that are available for them.
data %>%
  filter(is.na(latitude), is.na(specificity)) %>%
  select(location, city, provstate, country_txt, specificity)
## # A tibble: 6 x 5
## location city provstate country_txt specificity
## <chr> <chr> <chr> <chr> <dbl>
## 1 <NA> Dhupgu~ West Ben~ India NA
## 2 Wazir Akhbar Khan neighborhood Kabul Kabul Afghanistan NA
## 3 The kidnapping took place in M~ Mogadi~ Banaadir Somalia NA
## 4 The attack took place in the L~ Muang Yala Thailand NA
## 5 Near Shahrah-e-Faisal Karachi Sindh Pakistan NA
## 6 Under a bridge near Tirumangal~ Alampa~ Tamil Na~ India NA
# For every column, count the cells whose text contains "Unknown"
# (missing values are ignored in the sum).
data %>%
  map_df(~ str_detect(.x, "Unknown")) %>%
  summarise_all(sum, na.rm = TRUE)
## # A tibble: 1 x 135
## eventid iyear imonth iday approxdate extended resolution country
## <int> <int> <int> <int> <int> <int> <int> <int>
## 1 0 0 0 0 0 0 0 0
## # ... with 127 more variables: country_txt <int>, region <int>,
## # region_txt <int>, provstate <int>, city <int>, latitude <int>,
## # longitude <int>, specificity <int>, vicinity <int>, location <int>,
## # summary <int>, crit1 <int>, crit2 <int>, crit3 <int>, doubtterr <int>,
## # alternative <int>, alternative_txt <int>, multiple <int>,
## # success <int>, suicide <int>, attacktype1 <int>,
## # attacktype1_txt <int>, attacktype2 <int>, attacktype2_txt <int>,
## # attacktype3 <int>, attacktype3_txt <int>, targtype1 <int>,
## # targtype1_txt <int>, targsubtype1 <int>, targsubtype1_txt <int>,
## # corp1 <int>, target1 <int>, natlty1 <int>, natlty1_txt <int>,
## # targtype2 <int>, targtype2_txt <int>, targsubtype2 <int>,
## # targsubtype2_txt <int>, corp2 <int>, target2 <int>, natlty2 <int>,
## # natlty2_txt <int>, targtype3 <int>, targtype3_txt <int>,
## # targsubtype3 <int>, targsubtype3_txt <int>, corp3 <int>,
## # target3 <int>, natlty3 <int>, natlty3_txt <int>, gname <int>,
## # gsubname <int>, gname2 <int>, gsubname2 <int>, gname3 <int>,
## # gsubname3 <int>, motive <int>, guncertain1 <int>, guncertain2 <int>,
## # guncertain3 <int>, individual <int>, nperps <int>, nperpcap <int>,
## # claimed <int>, claimmode <int>, claimmode_txt <int>, claim2 <int>,
## # claimmode2 <int>, claimmode2_txt <int>, claim3 <int>,
## # claimmode3 <int>, claimmode3_txt <int>, compclaim <int>,
## # weaptype1 <int>, weaptype1_txt <int>, weapsubtype1 <int>,
## # weapsubtype1_txt <int>, weaptype2 <int>, weaptype2_txt <int>,
## # weapsubtype2 <int>, weapsubtype2_txt <int>, weaptype3 <int>,
## # weaptype3_txt <int>, weapsubtype3 <int>, weapsubtype3_txt <int>,
## # weaptype4 <int>, weaptype4_txt <int>, weapsubtype4 <int>,
## # weapsubtype4_txt <int>, weapdetail <int>, nkill <int>, nkillus <int>,
## # nkillter <int>, nwound <int>, nwoundus <int>, nwoundte <int>,
## # property <int>, propextent <int>, propextent_txt <int>,
## # propvalue <int>, ...
The minimum longitude value, -8.6e+07, appears to be incorrect. Comparing the location with its actual coordinates on Google makes it obvious that a decimal separator is missing. The code below fixes that by changing the value to -86.185896.
# Show the incident that holds the smallest (suspicious) longitude value.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned.
data %>%
  filter(longitude == min(longitude, na.rm = TRUE))
## # A tibble: 1 x 135
## eventid iyear imonth iday approxdate extended resolution country
## <dbl> <dbl> <dbl> <dbl> <chr> <dbl> <chr> <dbl>
## 1 1.98e11 1982 12 24 <NA> 0 <NA> 145
## # ... with 127 more variables: country_txt <chr>, region <dbl>,
## # region_txt <chr>, provstate <chr>, city <chr>, latitude <dbl>,
## # longitude <dbl>, specificity <dbl>, vicinity <dbl>, location <chr>,
## # summary <chr>, crit1 <dbl>, crit2 <dbl>, crit3 <dbl>, doubtterr <dbl>,
## # alternative <dbl>, alternative_txt <chr>, multiple <dbl>,
## # success <dbl>, suicide <dbl>, attacktype1 <dbl>,
## # attacktype1_txt <chr>, attacktype2 <dbl>, attacktype2_txt <chr>,
## # attacktype3 <int>, attacktype3_txt <chr>, targtype1 <dbl>,
## # targtype1_txt <chr>, targsubtype1 <dbl>, targsubtype1_txt <chr>,
## # corp1 <chr>, target1 <chr>, natlty1 <dbl>, natlty1_txt <chr>,
## # targtype2 <dbl>, targtype2_txt <chr>, targsubtype2 <dbl>,
## # targsubtype2_txt <chr>, corp2 <chr>, target2 <chr>, natlty2 <dbl>,
## # natlty2_txt <chr>, targtype3 <dbl>, targtype3_txt <chr>,
## # targsubtype3 <dbl>, targsubtype3_txt <chr>, corp3 <chr>,
## # target3 <chr>, natlty3 <dbl>, natlty3_txt <chr>, gname <chr>,
## # gsubname <chr>, gname2 <chr>, gsubname2 <chr>, gname3 <chr>,
## # gsubname3 <chr>, motive <chr>, guncertain1 <dbl>, guncertain2 <dbl>,
## # guncertain3 <lgl>, individual <dbl>, nperps <dbl>, nperpcap <dbl>,
## # claimed <dbl>, claimmode <dbl>, claimmode_txt <chr>, claim2 <dbl>,
## # claimmode2 <int>, claimmode2_txt <chr>, claim3 <lgl>,
## # claimmode3 <int>, claimmode3_txt <chr>, compclaim <int>,
## # weaptype1 <dbl>, weaptype1_txt <chr>, weapsubtype1 <dbl>,
## # weapsubtype1_txt <chr>, weaptype2 <dbl>, weaptype2_txt <chr>,
## # weapsubtype2 <dbl>, weapsubtype2_txt <chr>, weaptype3 <dbl>,
## # weaptype3_txt <chr>, weapsubtype3 <dbl>, weapsubtype3_txt <chr>,
## # weaptype4 <int>, weaptype4_txt <chr>, weapsubtype4 <int>,
## # weapsubtype4_txt <chr>, weapdetail <chr>, nkill <dbl>, nkillus <dbl>,
## # nkillter <dbl>, nwound <dbl>, nwoundus <dbl>, nwoundte <dbl>,
## # property <dbl>, propextent <dbl>, propextent_txt <chr>,
## # propvalue <dbl>, ...
# Put the missing decimal separator back into the longitude of event
# 198212240004.
data <- data %>%
  mutate(longitude = replace(longitude, eventid == 198212240004, -86.185896))
Deal with missing longitude and latitude by using the country's location as the incident location. The country location is obtained from the MapQuest Geocoding API.
# Fill in missing coordinates with the coordinates of the incident's country,
# looked up through the MapQuest geocoding endpoint.
# NOTE(review): `KEY` (the API key) is not defined in the visible part of this
# file; it must exist before this chunk runs.
# HTTPS is used so the API key is not sent in clear text.
baseurl <- "https://www.mapquestapi.com/geocoding/v1/address?"

# Countries that have at least one incident without a longitude.
country <- data %>%
  filter(is.na(longitude)) %>%
  distinct(country_txt)

# Build the search term for each country: spaces become "+", and a few
# country names are replaced by the name that is actually queried.
country <- country %>%
  mutate(location = str_replace_all(country_txt, " ", "+"))
country$location[country$location == "North+Yemen"] <- "Yemen"
country$location[country$location == "South+Yemen"] <- "Yemen"
country$location[country$location == "West+Germany+(FRG)"] <- "Germany"
country$location[country$location == "Yugoslavia"] <- "Croatia"
country$location[country$location == "Zaire"] <- "Democratic+Republic+of+the+Congo"
country$location[country$location == "Soviet+Union"] <- "Russia"
country$location[country$location == "West+Bank+and+Gaza+Strip"] <- "Gaza+Strip"
country$location[country$location == "Rhodesia"] <- "Zimbabwe"
country$location[country$location == "Czechoslovakia"] <- "Czech+Republic"

url <- str_c(
  baseurl, "key=", KEY,
  "&location=", country$location,
  "&outFormat=csv"
)

# Geocode every country. Each response is tagged with the country it belongs
# to (via `.id`) and reduced to its first "COUNTRY"-quality match, so the
# results can be joined by key. Binding them column-wise by position would
# fail, or silently misalign, whenever a query returns zero or several
# "COUNTRY" rows.
missing <- url %>%
  set_names(country$country_txt) %>%
  map_dfr(
    ~ read_csv(.x) %>%
      select(Lat, Lng, GeocodeQuality, GeocodeQualityCode, Country) %>%
      filter(GeocodeQuality == "COUNTRY") %>%
      slice(1),
    .id = "country_txt"
  )
country <- left_join(country, missing, by = "country_txt")

# Country-level coordinates for every incident that lacks a longitude.
temp <- data %>%
  filter(is.na(longitude)) %>%
  select(eventid, country_txt) %>%
  left_join(country, by = "country_txt") %>%
  select(eventid, Lng, Lat)

# Keep recorded coordinates where present; otherwise fall back to the
# country-level values.
data <- data %>%
  left_join(temp, by = "eventid") %>%
  mutate(
    longitude = ifelse(is.na(longitude), Lng, longitude),
    latitude = ifelse(is.na(latitude), Lat, latitude)
  )
Unknown days and months are recorded as 0 in the data; change them to 1. Then create a new column called idate that combines the year, month and day.
# A day or month of 0 is replaced by 1 so that a date can be built.
data <- data %>%
  mutate(
    iday = replace(iday, iday == 0, 1),
    imonth = replace(imonth, imonth == 0, 1)
  )

# Combine year, month and day into a single Date column.
data$idate <- lubridate::ymd(
  paste(data$iyear, data$imonth, data$iday, sep = "-")
)
Group names contain characters outside the printable ASCII range. The following code removes them.
# Remove every character outside the printable ASCII range (space to "~")
# from the three group-name columns.
data <- data %>%
  mutate(
    gname = gsub("[^ -~]", "", gname),
    gname2 = gsub("[^ -~]", "", gname2),
    gname3 = gsub("[^ -~]", "", gname3)
  )
Create a label for the leaflet map by making a new column called popmap.
# Build the HTML popup text for each incident.
# Secondary and tertiary attack type, weapon, target and group are turned into
# ", <value>" suffixes, or a single space when the value is missing, so they
# can be appended directly after the primary value.
data$popmap <- data %>%
  mutate_at(
    vars(
      attacktype2_txt, attacktype3_txt,
      weaptype2_txt, weaptype3_txt,
      targtype2_txt, targtype3_txt,
      gname2, gname3
    ),
    function(txt) ifelse(is.na(txt), " ", str_c(", ", txt))
  ) %>%
  transmute(
    popmap = str_c(
      "Country: ", country_txt, " <br/> ",
      "Date: ", idate, " <br/> ",
      "Attack type: ", attacktype1_txt, attacktype2_txt, attacktype3_txt, " <br/> ",
      "Weapon: ", weaptype1_txt, weaptype2_txt, weaptype3_txt, " <br/> ",
      "Target: ", targtype1_txt, targtype2_txt, targtype3_txt, " <br/> ",
      "Group responsible: ", gname, gname2, gname3, " <br/> ",
      # str_replace_na() turns a missing value into the text "NA" so it does
      # not blank out the whole label.
      "Casualty: ", str_replace_na(nkill), " <br/> ",
      "Injured: ", str_replace_na(nwound), " <br/> ",
      "Property damage: ", str_replace_na(propextent_txt)
    )
  ) %>%
  pull(popmap)
# Yearly number of incidents per region as a line chart.
# fct_reorder2() expects the x variable first and the y variable second; with
# (iyear, n) the legend is ordered by each region's count in its latest year,
# so it lines up with the right-hand end of the lines.
data %>%
  count(iyear, region_txt) %>%
  ggplot() +
  geom_line(aes(iyear, n, color = fct_reorder2(region_txt, iyear, n))) +
  scale_x_continuous(breaks = 1970:2017) +
  # Replace the default legend title, which would be the raw expression.
  labs(color = "Region") +
  theme(axis.text.x = element_text(angle = 45))

# Stacked area chart of yearly incident counts per region, converted to an
# interactive plotly widget.
ggplotly(
  data %>%
    count(iyear, region_txt) %>%
    ggplot() +
    geom_area(aes(x = iyear, y = n, fill = region_txt)) +
    scale_x_continuous(breaks = 1970:2017) +
    theme(axis.text.x = element_text(angle = 45))
)
## Number of incidents
# Interactive line chart of yearly incident counts, one line per country.
# The legend is hidden.
ggplotly(
  data %>%
    count(iyear, country_txt) %>%
    ggplot() +
    geom_line(aes(x = iyear, y = n, color = country_txt, text = iyear)) +
    theme(legend.position = "none")
)
## Warning: Ignoring unknown aesthetics: text
# Interactive stacked area chart of yearly incident counts, one band per
# country. The legend is hidden.
ggplotly(
  data %>%
    count(iyear, country_txt) %>%
    ggplot() +
    geom_area(aes(x = iyear, y = n, fill = country_txt, text = iyear)) +
    theme(legend.position = "none")
)
## Warning: Ignoring unknown aesthetics: text
#worldmap + geom_point(data = data , aes(x = longitude, y = latitude), alpha = 0.1, size = 0.5, color = "red") +
# transition_states(iyear) +
#labs(title = "Year: {closest_state}")
# Plot every incident as a small translucent red dot on the world map and hand
# the result to plotly, with `iyear` supplied as the `frame` aesthetic.
ggplotly(
  worldmap +
    geom_point(
      data = data,
      mapping = aes(x = longitude, y = latitude, frame = iyear),
      color = "red",
      size = 0.5,
      alpha = 0.1
    )
)
## Warning: Ignoring unknown aesthetics: frame